Riemannian metric from manifold learning

library(tidyverse)
library(dimRed)
library(reticulate)
library(viridis)
library(hdrcde)
library(igraph)
library(matrixcalc)
library(akima)
library(car)
Jmisc::sourceAll(here::here("R/sources"))
set.seed(123)

Smart meter data for one household

# load(here::here("data/spdemand_3639id336tow.rda"))
# nid <- 1
# ntow <- 336
# 
# train <- spdemand %>%
#   lazy_dt() %>%
#   filter(tow <= ntow,
#          # id <= sort(unique(spdemand[,id]))[nid],
#          id == 1003) %>%
#   dplyr::select(-id, -tow) %>%
#   data.table::as.data.table()
train <- readRDS(here::here("data/spdemand_1id336tow_train.rds"))

Metric learning

Radius searching with k-d trees

# Parameters fixed
x <- train
N <- nrow(train)
s <- 2
k <- 20
method <- "annIsomap"
annmethod <- "kdtree"
distance <- "euclidean"
treetype <- "kd"
searchtype <- "radius" # change searchtype for radius search based on `radius`, or KNN search based on `k`
radius <- .4 # the bandwidth parameter, \sqrt(\elsilon), as in algorithm

The metric learning algorithm

metric_isomap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, invert.h = TRUE)
summary(metric_isomap)
##                Length Class     Mode   
## embedding         672 -none-    numeric
## rmetric          1344 -none-    numeric
## weighted_graph     10 igraph    list   
## adj_matrix     112896 dgCMatrix S4     
## laplacian      112896 dgCMatrix S4

The function metricML() returns a list of

  • the embedding coordinates embedding of \(N \times s\)
  • the embedding metric rmetric for each point \(p \in x\) as an array
  • the weighted neighborhood graph as an igraph object weighted_graph
  • the sparse adjacency matrix from the graph adj_matrix
  • the graph laplacian matrix laplacian

Check the output for each of the four steps

The metric learning algorithm consists of four steps:

  1. construct a weighted neighborhood graph weighted_graph
  2. calculate the graph laplacian laplacian
  3. map the data \(p \in x\) to the embedding coordinates embedding
  4. apply the graph laplacian to the embedding corrdinates to obtain the embedding metric rmetric

Based on the output from megaman package, I compared the R function metricML() output for each of the four steps.

Step1: weighted graph

We compare the graph by using the weighted adjacency matrix.

metric_isomap$adj_matrix[1:6, 1:6]
## 6 x 6 sparse Matrix of class "dgCMatrix"
##                                                                 
## [1,] 1.0000000 0.7403658 0.4893711 .         .         .        
## [2,] 0.7403658 1.0000000 0.8251851 0.6199676 0.4515039 0.3956647
## [3,] 0.4893711 0.8251851 1.0000000 0.8651579 0.6997442 0.5946050
## [4,] .         0.6199676 0.8651579 1.0000000 0.9248341 0.7988909
## [5,] .         0.4515039 0.6997442 0.9248341 1.0000000 0.8770268
## [6,] .         0.3956647 0.5946050 0.7988909 0.8770268 1.0000000
aff <-
  feather::read_feather(here::here("data/affinity_matrix_1id336tow.feather"))
aff[1:6, 1:6]
## # A tibble: 6 x 6
##     `0`   `1`   `2`   `3`   `4`   `5`
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1     0.740 0.489 0     0     0    
## 2 0.740 1     0.825 0.620 0.452 0.396
## 3 0.489 0.825 1     0.865 0.700 0.595
## 4 0     0.620 0.865 1     0.925 0.799
## 5 0     0.452 0.700 0.925 1     0.877
## 6 0     0.396 0.595 0.799 0.877 1
all.equal(as.matrix(aff), as.matrix(metric_isomap$adj_matrix), tolerance = 1e-5, check.attributes = F)
## [1] TRUE

Step2: Laplacian matrix

Ln <- metric_isomap$laplacian
Ln[1:6, 1:6]
## 6 x 6 sparse Matrix of class "dgCMatrix"
##                                                                             
## [1,] -23.8073282   0.6096452   0.2482634   .           .           .        
## [2,]   0.8217472 -24.2336948   0.3895801   0.2092046   0.1386494   0.1262752
## [3,]   0.4589835   0.5343428 -24.6010558   0.2466973   0.1815778   0.1603563
## [4,]   .           0.3379937   0.2905886 -24.7599288   0.2020497   0.1813911
## [5,]   .           0.2316924   0.2212245   0.2089848 -24.7943612   0.1874356
## [6,]   .           0.2083193   0.1928744   0.1852211   0.1850419 -24.7807239
lap <-
  feather::read_feather(here::here("data/laplacian_matrix_1id336tow.feather"))
lap[1:6, 1:6]
## # A tibble: 6 x 6
##       `0`     `1`     `2`     `3`     `4`     `5`
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 -23.8     0.610   0.248   0       0       0    
## 2   0.822 -24.2     0.390   0.209   0.139   0.126
## 3   0.459   0.534 -24.6     0.247   0.182   0.160
## 4   0       0.338   0.291 -24.8     0.202   0.181
## 5   0       0.232   0.221   0.209 -24.8     0.187
## 6   0       0.208   0.193   0.185   0.185 -24.8
all.equal(as.matrix(lap), as.matrix(Ln), tolerance = 1e-5, check.attributes = F)
## [1] TRUE

Step3: Embedding coordinates

fn <- metric_isomap$embedding
fn %>% head()
##               E1        E2
## [1,]  0.07615319 0.4229855
## [2,] -0.09854840 0.3503061
## [3,] -0.21017105 0.2766018
## [4,] -0.29429931 0.1997070
## [5,] -0.31819209 0.1549928
## [6,] -0.34529194 0.1165973
emb_isomap <- feather::read_feather(here::here("data/embedding_isomap_1id336tow.feather"))
emb_isomap %>% head()
## # A tibble: 6 x 2
##       `0`    `1`
##     <dbl>  <dbl>
## 1  0.0725 -0.465
## 2 -0.105  -0.407
## 3 -0.204  -0.290
## 4 -0.307  -0.186
## 5 -0.354  -0.122
## 6 -0.398  -0.114
all.equal(as.matrix(emb_isomap), as.matrix(fn), tolerance = 1e-5, check.attributes = F)
## [1] "Mean relative difference: 0.8138705"
par(mfrow = c(1, 2))
plot(fn, col = viridis::viridis(24), asp=1) 
plot(emb_isomap$`0`, emb_isomap$`1`, col = viridis::viridis(24), asp=1)

Step4: Riemannian metric

In metricML(), the invert.h argument controls whether the inverse of Riemannian metric should be returned. FALSE by default.

The megaman package returns the inverse of Riemannian matrix.

# Rn <- riemann_metric(Y = fn, laplacian = Ln, d = s, invert.h = T)
Rn <- metric_isomap$rmetric
Rn[,,1:3]
## , , 1
## 
##           [,1]      [,2]
## [1,] 0.4378822 0.0594542
## [2,] 0.0594542 0.1081907
## 
## , , 2
## 
##           [,1]      [,2]
## [1,] 0.5268865 0.1531055
## [2,] 0.1531055 0.2067521
## 
## , , 3
## 
##           [,1]      [,2]
## [1,] 0.4775869 0.1744485
## [2,] 0.1744485 0.3813567
np = import("numpy")
H_isomap <- np$load(here::here("data/hmatrix_isomap_1id336tow.npy"))
H_isomap[1,,, drop=TRUE] %>% 
  matrixcalc::is.positive.definite()
## [1] TRUE
pyh_isomap <- array(NA, dim = c(s, s, N))
for(i in 1:N){
  pyh_isomap[,,i] <- H_isomap[i,,]
}
pyh_isomap[,,1:3] # inverted
## , , 1
## 
##             [,1]        [,2]
## [1,]  0.37657276 -0.00438684
## [2,] -0.00438684  0.17479710
## 
## , , 2
## 
##            [,1]       [,2]
## [1,]  0.5200775 -0.1136964
## [2,] -0.1136964  0.2999303
## 
## , , 3
## 
##            [,1]       [,2]
## [1,]  0.4890490 -0.1653728
## [2,] -0.1653728  0.4545575
all.equal(pyh_isomap, Rn, tolerance = 1e-5, check.attributes = F)
## [1] "Mean relative difference: 0.5528692"
# If we use the embedding from Python, the inverse of Riem metric is the same. 
hn <- riemann_metric(Y = as.matrix(emb_isomap), laplacian = Ln, d = s, invert.h = T)
hn[,,1:3]
## , , 1
## 
##             [,1]        [,2]
## [1,]  0.37657276 -0.00438684
## [2,] -0.00438684  0.17479710
## 
## , , 2
## 
##            [,1]       [,2]
## [1,]  0.5200775 -0.1136964
## [2,] -0.1136964  0.2999303
## 
## , , 3
## 
##            [,1]       [,2]
## [1,]  0.4890490 -0.1653728
## [2,] -0.1653728  0.4545575
all.equal(pyh_isomap, hn, tolerance = 1e-5, check.attributes = F) # use python embedding
## [1] TRUE

Finally, the R function metricML() gives the same/close outputs as the Python megaman package.

Ellipse plot

Using the Riemannian metric for each point as the 2-d covariance matrix and the 2-d embeddings as the centroid to plot an ellipse for each point.

The underlying requirement is that the Riemannian metric needs to be a square positive definite matrix, i.e. matrixcalc::is.positive.definite() returns TRUE.

# TODO: if add = FALSE, fix xlim, ylim, radius for generating ellipse
# TODO: add argument `n.plot` to specify how many ellipse to be added
x <- metric_isomap$embedding
cols <- viridis::viridis(24) 
plot(metric_isomap$embedding, col = cols, asp=1, pch = 20)
for(i in 1:N){
  mat <- Rn[,,i] # pyh_isomap[,,I]
  center <- c(x[i,1], x[i,2])
  # add <- ifelse(i == 1, F, T)
  add <- T
  car::ellipse(center, mat, 
               radius = .1, # controls the size of ellipse, currently set manually
               # col = cols[i], 
               col = blues9[5],
               asp = 1, pty = "s", lwd = 1, center.pch = 19, center.cex = 0,
               fill = T, fill.alpha = 0.2, add = add, grid = T,
               # xlim = c(-.2, .25), ylim = c(-.2, .2)
               )
}

Contour plots

Variable kernel density estimate

For multivariate data, the variable kernel density estimate is given by \[\hat{f}(x) = n^{-1} \sum_i K_H (x - X_i).\]

Inverse of Riemannian metric as bandwidth matrix

If we use the embedding x and the inverse of Riemannian metric h, we could produce contour plots based on the density estimates.

x <- cbind(emb_isomap$`0`, emb_isomap$`1`)
f <- mkde(x = x, h = pyh_isomap)
## Time difference of 9.785535 secs
summary(f)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1160  0.2240  0.3087  0.3029  0.3916  0.4828
# Top 20 index with the lowest densities
head(order(f), n=20)
##  [1]  39 328 327 182 183 136 329 135  40  38 134 278 282 133 326  86 279 231 137
## [20] 281

Now apply the mkde() function to the output of metricML() written in R.

x <- metric_isomap$embedding
riem_isomap <- metric_isomap$rmetric
f <- mkde(x = x, h = riem_isomap)
## Time difference of 7.821089 secs
summary(f)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.09451 0.20224 0.30641 0.28375 0.36464 0.47392
head(order(f), n=20)
##  [1]  39 328 327 183 182 135 282 278 329  40 134 136  38 326 231  86 133 281 137
## [20] 279

We can check the outliers based on density f. In R, the outliers are similar to the megaman output.

Riemannian metric itself as bandwidth matrix

R_isomap <- np$load(here::here("data/rmatrix_isomap_1id336tow.npy"))
# R_isomap[1,,, drop=TRUE] %>% 
#   matrixcalc::is.positive.definite() # not necessarily symmetric
pyr_isomap <- array(NA, dim = c(s, s, N))
for(i in 1:N){
  pyr_isomap[,,i] <- R_isomap[i,,]
}
pyr_isomap[,,1:3]
## , , 1
## 
##            [,1]       [,2]
## [1,] 2.65630592 0.06666465
## [2,] 0.06666465 5.72259192
## 
## , , 2
## 
##           [,1]      [,2]
## [1,] 2.0965329 0.7947453
## [2,] 0.7947453 3.6353771
## 
## , , 3
## 
##          [,1]     [,2]
## [1,] 2.331629 0.848271
## [2,] 0.848271 2.508551
x <- cbind(emb_isomap$`0`, emb_isomap$`1`)
f <- mkde(x = x, h = pyr_isomap)
## Time difference of 5.854377 secs
summary(f)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02254 0.03947 0.04608 0.04784 0.05574 0.08985
# Top 20 index with the lowest densities
head(order(f), n=20)
##  [1] 298 296 297 300 299 199  55 200 251  56 250 249  54 248 198 104 276 152 268
## [20] 312
p_pyr <- plot.contour(x, f, plot.hdr = F)

Now apply the mkde() function to the output of metricML() written in R.

hn <- riemann_metric(Y = as.matrix(emb_isomap), laplacian = Ln, d = s, invert.h = T)
f <- mkde(x = metric_isomap$embedding, h = hn)
## Time difference of 6.570726 secs
summary(f)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1031  0.2424  0.3005  0.3044  0.3933  0.4917
head(order(f), n=20)
##  [1] 282 183 328  39 135 237  40 136 329 182 327 283 326 281  38 232  41 331 137
## [20] 134
p_rr <- plot.contour(x, f, plot.hdr = F)

t-SNE

metric_tsne <- metricML(x, s = s, k = k, radius = radius, method = "tSNE", annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, invert.h = TRUE)
## Warning in matchPars(methodObject, list(...)): Parameter matching: knn is not a
## standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: annmethod is
## not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: radius is not
## a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: eps is not a
## standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: nt is not a
## standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: nlinks is not
## a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching:
## ef.construction is not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: distance is
## not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: treetype is
## not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: searchtype is
## not a standard parameter, ignoring.
summary(metric_tsne)
##                Length Class     Mode   
## embedding         672 -none-    numeric
## rmetric          1344 -none-    numeric
## weighted_graph     10 igraph    list   
## adj_matrix     112896 dgCMatrix S4     
## laplacian      112896 dgCMatrix S4
f <- mkde(x = metric_tsne$embedding, h = metric_tsne$rmetric)
## Time difference of 8.543279 secs
summary(f)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0001540 0.0003175 0.0003758 0.0004688 0.0006294 0.0013705
head(order(f), n=20)
##  [1]  69 212 213 157 165  70  68 116 117 214 164 158 118  71 215 156 166 109 216
## [20]  61
plot.contour(x = metric_tsne$embedding, f, plot.hdr = F)

## $p
## NULL